# ------------------------------
# Recreate the "sameness" index at the PUMA or county level.
# This code can create yearly measures as well as overall measures.
# ------------------------------

# Weighted standard deviation of `x` using weights `w`.
#
# Args:
#   x:     numeric vector of observations.
#   w:     numeric vector of weights, same length as `x`. When NULL (the
#          default) every observation gets weight 1, so the result equals
#          the ordinary sample standard deviation sd(x).
#   na.rm: if TRUE, positions where either `x` or `w` is NA are dropped
#          before computing. If FALSE, NAs are kept and the result is NA
#          whenever one is present.
#
# Returns:
#   sqrt(sum(w * (x - weighted.mean(x, w))^2) / (sum(w) - 1)); note the
#   denominator is the sum of the weights minus one.
weighted.sd <- function(x, w = NULL, na.rm = FALSE) {
	# The NULL default previously fell through to weighted.mean(x, NULL),
	# which stops with a length-mismatch error; treat it as unit weights.
	if (is.null(w)) {
		w <- rep(1, length(x))
	}
	# Check up front: with na.rm = TRUE a mismatch would otherwise be hidden
	# by recycling inside the NA mask.
	if (length(w) != length(x)) {
		stop("'x' and 'w' must have the same length", call. = FALSE)
	}
	if (na.rm) {
		keep <- !(is.na(x) | is.na(w))
		x <- x[keep]
		w <- w[keep]
	}
	centre <- weighted.mean(x, w)
	sqrt(sum(w * (x - centre)^2) / (sum(w) - 1))
}

#------------------------------------------------------------------------------
# Load the processed county-level file as a data.table.
rdata <- read_fst(file.path(processed_path, 'merged_county_v1.fst'), as.data.table = TRUE)

#------------------------------------------------------------------------------
# Data preprocessing

# Derived columns start out as plain copies of their source columns.
rdata[, `:=`(Race5 = Race4, MarStat5 = MarStat6)]

# Race5: rows with Hisp == 1 get code 5 regardless of Race4, then code 9
# (the "other race" categories) is blanked out to NA.
rdata[Hisp == 1, Race5 := 5]
rdata[Race5 == 9, Race5 := NA]

# MarStat5: fold code 6 into code 5.
rdata[MarStat5 == 6, MarStat5 := 5]

# Drop the younger-than-15 group (AgeGrp4 == 0). data.table treats an NA in
# the row filter as FALSE, so rows with missing AgeGrp4 are dropped as well.
rdata <- rdata[AgeGrp4 != 0]

# Short aliases for the outcome and state columns.
rdata[, `:=`(Suic = DSID, St = state)]

# Drop rows whose state indicator is missing.
rdata <- rdata[!is.na(St)]

#==============================================================================
# Build the individual-level table: id, covariates / grouping keys, outcome.
list_var <- c('Sex','AgeGrp4','Race5','MarStat5','BornUSA','UnEmpl', 'PhysProb','county','Year')
ind_rdata <- rdata[, c('uid', list_var, 'Suic'), with = FALSE]

# check missing patterns -- looks okay
md.pattern(ind_rdata)

# Keep only rows observed on sex, age group, race, BornUSA, county and Year.
# The remaining columns may still contain NAs after this step.
ind_rdata <- ind_rdata[
	!is.na(Sex) & !is.na(AgeGrp4) & !is.na(Race5) &
	!is.na(BornUSA) & !is.na(county) & !is.na(Year)
]

# Convert every variable in list_var to a factor. set() replaces the column
# by reference; the previous `ind_rdata[[var]] = ...` form copied the whole
# table once per column.
for (var in list_var) {
	set(ind_rdata, j = var, value = as.factor(ind_rdata[[var]]))
}

# The full table is no longer needed; release it before imputation.
rm(rdata); gc()

#==============================================================================
# Multiple imputation using MICE, run within each county for each year.
# Distinct grouping values present in the data:
#   county_year - one row per observed (county, Year) combination
#   list_county - distinct counties
#   list_year   - distinct years
county_year <- unique(ind_rdata[, .(county, Year)])
list_county <- ind_rdata[, unique(county)]
list_year <- ind_rdata[, unique(Year)]

# Multiply impute the missing values of `tmp` with mice and return all
# completed data sets stacked on top of each other.
#
# Args:
#   tmp:   data.frame / data.table holding the variables to impute.
#   m_set: number of imputed data sets to generate (default 10).
#   seed:  random seed handed to mice() so the imputations are reproducible
#          (default 12345, the value that used to be hard-coded).
#
# Returns:
#   A data.table in mice's "long" layout: the m_set completed data sets
#   stacked row-wise, identified by the `.imp` and `.id` columns that
#   complete() adds.
impute_missing <- function(tmp, m_set = 10, seed = 12345) {
	# Namespace-qualified so that another attached package exporting
	# `complete` (e.g. tidyr) cannot mask the mice version.
	imp <- mice::mice(tmp, m = m_set, seed = seed)
	data.table(mice::complete(imp, action = "long"))
}

# Output directory for the per-year imputation files. The loop below skips
# years that are already on disk, so this script is meant to be re-runnable;
# only create the directory when it is absent to avoid the "already exists"
# warning from dir.create() on every re-run.
imputed_dir <- file.path(processed_path, 'imputed_by_year')
if (!dir.exists(imputed_dir)) {
	dir.create(imputed_dir)
}

# One output file per year. A year whose file already exists is not
# recomputed.
for (yy in list_year) {
	outfile <- file.path(imputed_dir, paste0('imputed_M10_', yy, '.fst'))
	if (!file.exists(outfile)) {
		message('=== NOW saving files for ',yy)

		year_tmp <- ind_rdata[Year == yy]
		# Impute separately inside each county. .SD does not contain the
		# grouping column, so `county` is appended back onto each imputed
		# table; every per-county result is wrapped in a list cell (V1).
		by_county <- year_tmp[, .(list(data.table(impute_missing(.SD), county))), by = 'county']
		imputed_imp <- rbindlist(by_county[, V1])
		write_fst(imputed_imp, outfile, compress = 100)
	}
	message('=== done files for ',yy)
}


# Stack the per-year imputation files and re-split them by imputation number.
# The year range is fixed here, so this section can be run on its own once
# the yearly files exist.
years_to_stack <- 2005:2011
yearly_files <- file.path(
	processed_path, 'imputed_by_year',
	paste0('imputed_M10_', years_to_stack, '.fst')
)

# Fail before loading anything if a yearly file has not been produced yet.
missing_files <- yearly_files[!file.exists(yearly_files)]
if (length(missing_files) > 0) {
	stop('missing yearly imputation file(s): ',
		paste(missing_files, collapse = ', '), call. = FALSE)
}

# Read every year and bind in one go instead of growing a list with a
# manual counter.
list_imputed <- rbindlist(lapply(yearly_files, read_fst))

# Drop the `.id` column and rename the imputation index `.imp` to `m`.
list_imputed[, .id := NULL]
setnames(list_imputed, '.imp', 'm')

# One file per imputation; 1..10 matches the M10 yearly files read above.
for (mm in seq_len(10L)) {
	outfile <- file.path(processed_path, paste0('imputed_M', mm, '_v1', '.fst'))
	write_fst(list_imputed[m == mm], outfile, compress = 100)
}


